Contributing authors:

Carmen Lia Murall, Raphaël Poujol, Susanne Kraemer, Art Poon, Jesse Shaprio

Introduction

This notebook is to explore Canadian SARS-CoV-2 genomic and epidemiological data, for discussion with pillar 6’s team and for sharing with collaborators. These analyses can spur further research within or across pillars, be used for reports (or data dashboards), given to the science communication pillar for public dissemination, and the code can be repackaged to give to public health authorities for their internal use.

Canadian genomic and epidemiological data will be regularly pulled from various public sources to keep these analyses up-to-date. We would like to acknoweldge all Canadian laboratories across the country that are making these data publically available. Only representations of aggregate data will be posted here. Below is a list of data sources and lab authors.

## 1. LOAD processed metadata of Canadian sequences (with latest pangolin, division, and full seq IDs)
#Download metadata from gisaid, put the date here:
gisaiddate="2022_01_17"
#date=2022_01_17
#tar -axf metadata_tsv_$date.tar.xz metadata.tsv -O | tr ' ' '_'  | sed 's/\t\t/\tNA\t/g' | sed 's/\t\t/\tNA\t/g' | sed 's/\t$/\tNA/g' | awk 'NR==1 || substr($1,9,6)=="Canada" && $8=="Human"' | sort -k3,3 > metadata_CANall_$date.uncorrected.csv 
#cat metadata_virrusseq_2022_01_17.tsv | tr ' ' '_'  | sed 's/\t\t/\tNA\t/g' | sed 's/\t\t/\tNA\t/g' | sed 's/\t$/\tNA/g' | awk 'NR!=1 && $43!="NA"' | cut -f5,43 | sort -k2,2 > epidatesfromvirrusseq
#join -1 3 metadata_CANall_$date.uncorrected.csv -a 1 -2 2 epidatesfromvirrusseq | awk '$4!=$23 && length($4)<10 && length($23)==10{$4=$23} {id=$1;$1=$2;$2=$3;$3=id} {print}' | tr ' ' '\t'| cut -f-22 > metadata_CANall_$date.csv
#zip CoVaRRNet_pillar6notebook/data_needed/metadata_CANall_$date.zip metadata_CANall_$date.csv

metaCANall <- read.csv(unz(paste("./data_needed/metadata_CANall_",gisaiddate,".zip",sep=""),paste("metadata_CANall_",gisaiddate,".csv",sep="")),sep="\t",row.names=NULL)
metaCANall$Collection_date <- as.Date(metaCANall$Collection_date)
#max(metaCANall$Collection.date) - min(metaCANall$Collection.date) #time diff: 580 days

#make a pango.group column

metaCANall$pango.group <- metaCANall$Variant
metaCANall$pango.group[is.na(metaCANall$pango.group)]<-"other"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Alpha_202012/01_GRY_(B.1.1.7+Q.x)_first_detected_in_the_UK"]<-"Alpha"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Beta_GH/501Y.V2_(B.1.351+B.1.351.2+B.1.351.3)_first_detected_in_South_Africa" ]<-"Beta"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Gamma_GR/501Y.V3_(P.1+P.1.x)_first_detected_in_Brazil/Japan"   ]<-"Gamma"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Delta_GK/478K.V1_(B.1.617.2+AY.x)_first_detected_in_India"   ]<-"Delta"
metaCANall$pango.group[metaCANall$Pango_lineage == "AY.25" ]<-"Delta AY.25"
metaCANall$pango.group[metaCANall$Pango_lineage == "AY.27" ]<-"Delta AY.27"
metaCANall$pango.group[metaCANall$pango.group == "VOI_Lambda_GR/452Q.V1_(C.37+C.37.1)_first_detected_in_Peru"  ]<-"Lambda"
metaCANall$pango.group[metaCANall$pango.group == "VOC_Omicron_GRA_(B.1.1.529+BA.*)_first_detected_in_Botswana/Hong_Kong/South_Africa" ]<-"Omicron"
metaCANall$pango.group[metaCANall$pango.group == "VOI_Mu_GH_(B.1.621+B.1.621.1)_first_detected_in_Colombia" ]<-"Mu"
metaCANall$pango.group[str_detect(metaCANall$pango.group,"V") ]<-"other"
metaCANall$pango.group[metaCANall$Pango_lineage == "A.23.1" ]<-"A.23.1"
metaCANall$pango.group[metaCANall$Pango_lineage == "B.1.438.1" ]<-"B.1.438.1"
metaCANall$pango.group[grepl("B\\.1\\.1\\.529|BA\\.", metaCANall$Pango_lineage)] <- "Omicron"

## 2. LOAD epidemiological data (PHAC)



epidataCANall <- read.csv(url("https://health-infobase.canada.ca/src/data/covidLive/covid19-download.csv"))
#from: https://health-infobase.canada.ca/covid-19/epidemiological-summary-covid-19-cases.html?stat=num&measure=total&map=pt#a2
epidate = tail(epidataCANall,1)$date
#epidataAlberta <- epidataCANall[ which(epidataCANall$prnameFR=='Alberta'), ]
#print(epidataAlberta$numtoday)

Snapshot: SARS-CoV-2 in Canada

Variants in Canada

Sequence counts for all Canadian data in GISAID, up to 2022-01-20, shows that by end of summer Alpha and Gamma were still the most sequenced variants. Note that some of the “other” category include some Delta sublineages (AYs) but overall, from the beginning of the pandemic to the fall of 2021, Canadian sequences were mostly of the wildtype lineages (pre-VOCs).

# --- Histogram plot: counts per variant of all canadian data -------------


listpango <- unique(metaCANall$pango.group)
print(listpango)
##  [1] "other"       "B.1.438.1"   "A.23.1"      "Alpha"       "Beta"       
##  [6] "Gamma"       "Delta"       "Lambda"      "Delta AY.27" "Delta AY.25"
## [11] "Mu"          "Omicron"
listpango <- listpango[order(listpango)]
getpal <- colorRampPalette(brewer.pal(9, "Paired")) #"Set3
pal <- getpal(length(listpango))
names(pal) <- listpango
pal["Omicron"]="#E52823"
pal["other"]="grey"
pal["Alpha"]="#B29C71"
pal["Delta"]="#A6CEE3"
pal["Delta AY.27"]="#438FC0"
pal["Delta AY.25"]="#61A6A0" 
pal["A.23.1"]="#9AD378"
pal["B.1.438.1"]="#3EA534"
pal["Beta"]="#CAB2D6"
pal["Gamma"]="#444444"
pal["Lambda"]="#F08C3A"
pal["Mu"]="#F08C3A"

#print(pal)

#------------- counts over time (by week)

plot_metadatadf <- function(meta_tab,pal) {
  meta_tab <- filter(meta_tab, !is.na(meta_tab$Collection_date))
  df1 <- meta_tab %>% group_by(pango.group) %>% group_by(week = cut(Collection_date, "week")) #adds week coloum
  dfcount <- df1 %>% group_by(week) %>% count(pango.group)
  #plot
  ggplot(dfcount, aes(x=as.Date(week), y= n, fill=pango.group))+geom_bar(stat = "identity")+
    scale_fill_manual(breaks=listpango, values = pal)+ ylab("")+xlab("")+
    scale_x_date(date_breaks = "28 day")+
    theme_classic()+theme(axis.text.x = element_text(angle=90, vjust=0.1,hjust=0.1))
  
}
  
plot_metadatadf_freq <- function(meta_tab,pal) {
  meta_tab <- filter(meta_tab, !is.na(meta_tab$Collection_date))
  df1 <- meta_tab %>% group_by(pango.group) %>% group_by(week = cut(Collection_date, "week")) #adds week coloum
  dfcount <- df1 %>% group_by(week) %>% count(pango.group)
  dfcount %>% mutate(freq = prop.table(n)) -> dfprop
  
  #plot
  ggplot(dfprop, aes(x= as.Date(week), y=freq, fill=pango.group))+geom_bar(stat = "identity")+
    scale_fill_manual(breaks=listpango, values = pal)+ ylab("")+xlab("")+
    scale_x_date(date_breaks = "28 day")+
    theme_classic()+theme(axis.text.x = element_text(angle=90, vjust=0.1,hjust=0.1))
}


plot_metadatadf(metaCANall,pal)

plot_metadatadf_freq(metaCANall,pal)

There are two PANGO lineages that have a Canadian origin and have predominately spread within Canada (with some exportations internationally): B.1.438.1 and B.1.1.176.

Other lineages of Canadian interest:

  • A.2.5.2 - an A lineage (clade 19B) that spread in Quebec, involved in several outbreaks, before Delta arrived
  • B.1.2 - an American (USA) lineage that spread well in Canada
  • B.1.160 - an European lineages that spread well in Canada

Provinces

Here we show the same plots but for each provinces

loc=sapply(strsplit(metaCANall$Location,"_/_"), `[`, 3)
metaCANall$province <- loc
print(unique(metaCANall$province))
##  [1] "Quebec"                    "Ontario"                  
##  [3] "Newfoundland_and_Labrador" "Nova_Scotia"              
##  [5] "New_Brunswick"             "Manitoba"                 
##  [7] "Alberta"                   "British_Columbia"         
##  [9] "Saskatchewan"              NA                         
## [11] "Newfoundland"

British_Columbia

plot_metadatadf(metaCANall[ which(metaCANall$province=='British_Columbia'), ],pal)

plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='British_Columbia'), ],pal)

Alberta

plot_metadatadf(metaCANall[ which(metaCANall$province=='Alberta'), ],pal)

plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Alberta'), ],pal)

Saskatchewan

plot_metadatadf(metaCANall[ which(metaCANall$province=='Saskatchewan'), ],pal)

plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Saskatchewan'), ],pal)

Manitoba

plot_metadatadf(metaCANall[ which(metaCANall$province=='Manitoba'), ],pal)

plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Manitoba'), ],pal)

Ontario

plot_metadatadf(metaCANall[ which(metaCANall$province=='Ontario'), ],pal)

plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Ontario'), ],pal)

Quebec

plot_metadatadf(metaCANall[ which(metaCANall$province=='Quebec'), ],pal)

plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Quebec'), ],pal)

Nova_Scotia

plot_metadatadf(metaCANall[ which(metaCANall$province=='Nova_Scotia'), ],pal)

plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='Nova_Scotia'), ],pal)

New_Brunswick

plot_metadatadf(metaCANall[ which(metaCANall$province=='New_Brunswick'), ],pal)

plot_metadatadf_freq(metaCANall[ which(metaCANall$province=='New_Brunswick'), ],pal)

Newfoundland_and_Labrador

subtab=metaCANall[ which(metaCANall$province=='Newfoundland' | metaCANall$province=='Newfoundland_and_Labrador' ), ]
plot_metadatadf(subtab,pal)

plot_metadatadf_freq(subtab,pal)

Canadian trees

Down-sampled Canadian SARS-CoV-2 genomes. Taken from GISAID Sept. 12th, 2021. Alignment GISAID, ML tree in IQ-tree. This tree was generated using TreeTime and visualized in ggtree.

This tree shows several features of VOC spread in Canada:

  • Alpha spread across most of Canada
  • Gamma was mostly located in BC
  • Alberta represents most of the Delta cases
  • Expansion of Alpha and Gamma coincide with a decrease in detection of wildtype variants
  • Delta is the bulk of what was being sequenced at the end of the summer

These last two points are suggestive of strain (variant) replacement, i.e. competitive exclusion, but more detailed analyses would be required to clarify this.

Divergence tree from TreeTime run, visualized in ggtree.

Evolution and growth rates of SARS-CoV-2 in Canada

There are various methods to investigate changes in evolutionary rates of VOC/VOIs and compare their relative fitness in an epidemiological context.

For example, root-to-tip plots give estimates of substitution rates. Clusters with stronger positive slopes than the average SARS-CoV-2 dataset, are an indication that they are accumlating mutations at a faster pace, or clusters that jump up above the average could indicate a mutational jump (simlar to what we saw with Alpha when it first appeared in the UK).

Maximum likelihood tree (IQ-TREE) processed with root-to-tip regression and plotting in R.

rooted <- read.tree('./data_needed/msa_0908_CANall_downsamp10perday_1.rtt.nwk')
get.dates <- function(phy, delimiter='_', pos=-1, format='%Y-%m-%d') {
  dt <- sapply(phy$tip.label, function(x) {
    tokens <- strsplit(x, delimiter)[[1]]
    if (pos < 0) { return(tokens[length(tokens)+pos+1]) }
    else { return(tokens[pos]) }
  })
  as.Date(dt, format=format)
}
tip.dates <- get.dates(rooted, pos=-2)

# total branch length from root to each tip
div <- node.depth.edgelength(rooted)[1:Ntip(rooted)]

# match tips to metadata to retrieve PANGO lineage assignments
accno <- gsub(".+_(EPI_ISL_[0-9]+)_.+", "\\1", rooted$tip.label)
index <- match(accno, metaCANall$Accession_ID)
pg <- metaCANall$pango.group[index]

blobs <- function(x, y, col, cex=1) {
  points(x, y, pch=21, cex=cex)
  points(x, y, bg=col, col=rgb(0,0,0,0), pch=21, cex=cex)
}
dlines <- function(x, y, col) {
  lines(x, y, lwd=2.5)
  lines(x, y, col=col)
}

vocs <- c('Alpha', 'Beta', 'Gamma', 'Delta', 'Omicron')
pal <- hcl.colors(n=length(vocs), palette="Sunset")

par(mar=c(5,5,0,1))
plot(tip.dates, div, type='n', las=1, cex.axis=0.6, cex.lab=0.7, bty='n',
     xaxt='n', xlab="Sample collection date", ylab="Divergence from root")
xx <- floor_date(seq(min(tip.dates), max(tip.dates), length.out=5), unit="months")
axis(side=1, at=xx, label=format(xx, "%b %Y"), cex.axis=0.6)

blobs(tip.dates[pg=='other'], div[pg=='other'], col='grey', cex=0.8)
fit0 <- rlm(div[pg=='other'] ~ tip.dates[pg=='other'])
abline(fit0, col='gray50')
fits <- list(other=fit0)

for (i in 1:length(vocs)) {
  variant <- vocs[i]
  x <- tip.dates[pg==variant]
  if (all(is.na(x))) next
  y <- div[pg==variant]
  blobs(x, y, col=pal[i], cex=0.8)
  fit <- rlm(y ~ x)
  dlines(fit$x[,2], predict(fit), col=pal[i])
  fits[[variant]] <- fit
}

legend(x=min(tip.dates), y=max(div), legend=vocs, pch=21, pt.bg=pal, bty='n', cex=0.8)

ci <- lapply(fits, confint.default)
kable(data.frame(
  n=sapply(fits, function(f) nrow(f$x)),                            
  est=29970*sapply(fits, function(f) f$coef[2]),
  lower.95=29970*sapply(ci, function(f) f[2,1]),
  upper.95=29970*sapply(ci, function(f) f[2,2])
), 
col.names = c("Number of genomes", "Estimate", "Lower 95% CI", "Upper 95% CI"),
format="html", align="rrrr", caption="Molecular clock rates (subs/genome/day)",
format.args = list(scientific = FALSE), digits=4, table.attr = "style='width:100%;'")
Molecular clock rates (subs/genome/day)
Number of genomes Estimate Lower 95% CI Upper 95% CI
other 3284 0.0478 0.0464 0.0492
Alpha 628 0.0377 0.0312 0.0442
Beta 16 0.0126 -0.0046 0.0299
Gamma 289 0.0597 0.0508 0.0686
Delta 181 0.0671 0.0489 0.0853

Overall, the evolutionary rate among genomes sampled in Canada (0.0678102 subs/genome/day, grey line) is close to the global average of 0.0674941 subs/genome/day. Compared to other lineages sampled in Canada, variant of concern Alpha (B.1.1.7) exhibited a slightly but significantly lower rate of evolution. Both variants of concern Gamma (P.1) and Delta (B.1.617.2) exhibited higher rates, although only Gamma was significantly higher.

Add here:

  • growth rates (Sally’s code)
  • dN/dS (by variant and by gene/domains)
  • Tajima’s D
  • other? (e.g. PyR0, nnet R, ?)
  • Delta and its sublineages (specific plots as above, also delta w/+w/o E484K in Canada/PTs)

Are there any BEAST analyses we’d like to do? e.g. infer R0, serial interval, etc for the different Delta sublineages (might have to restrict this geographically to specific PTs with enough coverage)

List of useful tools

Collect a list of bioinformatics, phylogenetic, and modelling tools that are useful for SARS-CoV-2 analyses:

Acknowledgements and Data sources

We thank all the authors, developers, and contributors to the GISAID and VirusSeq database for making their SARS-CoV-2 sequences publicly available. We thank especially the Canadian Public Health Laboratory Network, academic sequencing partners, diagnostic hospital labs, and other sequencing partners for the provision of the Canadian sequence data used in this work. Genome sequencing in Canada was supported by a Genome Canada grant to the Canadian COVID-19 Genomic Network (CanCOGeN).

Session info

The version numbers of all packages in the current environment as well as information about the R install is reported below.

Hide

Show

sessionInfo()
## R version 4.1.1 (2021-08-10)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22000)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Canada.1252  LC_CTYPE=English_Canada.1252   
## [3] LC_MONETARY=English_Canada.1252 LC_NUMERIC=C                   
## [5] LC_TIME=English_Canada.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] MASS_7.3-54        viridis_0.6.1      viridisLite_0.4.0  colorspace_2.0-2  
##  [5] RColorBrewer_1.1-2 kableExtra_1.3.4   gridExtra_2.3      ggbeeswarm_0.6.0  
##  [9] DT_0.19            cowplot_1.1.1      ggtree_3.0.4       phytools_0.7-90   
## [13] maps_3.4.0         phangorn_2.7.1     tidytree_0.3.5     phylotools_0.2.2  
## [17] ape_5.5            treeio_1.16.2      lubridate_1.7.10   reticulate_1.22   
## [21] knitr_1.36         forcats_0.5.1      stringr_1.4.0      dplyr_1.0.7       
## [25] purrr_0.3.4        readr_2.0.2        tidyr_1.1.4        tibble_3.1.4      
## [29] ggplot2_3.3.5      tidyverse_1.3.1   
## 
## loaded via a namespace (and not attached):
##  [1] ellipsis_0.3.2          fs_1.5.0                aplot_0.1.1            
##  [4] rstudioapi_0.13         farver_2.1.0            fansi_0.5.0            
##  [7] xml2_1.3.2              codetools_0.2-18        mnormt_2.0.2           
## [10] jsonlite_1.7.2          broom_0.7.9             dbplyr_2.1.1           
## [13] png_0.1-7               compiler_4.1.1          httr_1.4.2             
## [16] backports_1.2.1         assertthat_0.2.1        Matrix_1.3-4           
## [19] fastmap_1.1.0           lazyeval_0.2.2          cli_3.0.1              
## [22] htmltools_0.5.2         tools_4.1.1             igraph_1.2.6           
## [25] coda_0.19-4             gtable_0.3.0            glue_1.4.2             
## [28] clusterGeneration_1.3.7 fastmatch_1.1-3         Rcpp_1.0.7             
## [31] cellranger_1.1.0        jquerylib_0.1.4         vctrs_0.3.8            
## [34] svglite_2.0.0           nlme_3.1-153            xfun_0.26              
## [37] rvest_1.0.1             lifecycle_1.0.1         scales_1.1.1           
## [40] hms_1.1.1               parallel_4.1.1          expm_0.999-6           
## [43] yaml_2.2.1              ggfun_0.0.4             yulab.utils_0.0.2      
## [46] sass_0.4.0              stringi_1.7.4           highr_0.9              
## [49] plotrix_3.8-2           rlang_0.4.11            pkgconfig_2.0.3        
## [52] systemfonts_1.0.2       evaluate_0.14           lattice_0.20-45        
## [55] labeling_0.4.2          patchwork_1.1.1         htmlwidgets_1.5.4      
## [58] tidyselect_1.1.1        magrittr_2.0.1          R6_2.5.1               
## [61] generics_0.1.0          combinat_0.0-8          DBI_1.1.1              
## [64] pillar_1.6.3            haven_2.4.3             withr_2.4.2            
## [67] scatterplot3d_0.3-41    modelr_0.1.8            crayon_1.4.1           
## [70] utf8_1.2.2              tmvnsim_1.0-2           tzdb_0.1.2             
## [73] rmarkdown_2.11          grid_4.1.1              readxl_1.3.1           
## [76] reprex_2.0.1            digest_0.6.28           webshot_0.5.2          
## [79] numDeriv_2016.8-1.1     gridGraphics_0.5-1      munsell_0.5.0          
## [82] beeswarm_0.4.0          ggplotify_0.1.0         vipor_0.4.5            
## [85] bslib_0.3.0             quadprog_1.5-8